package module3

import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.functions.{col, concat, explode, from_json, lit, regexp_replace, sha2, split}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

/**
 * A collection of small, self-contained Spark exercises.
 *
 * Every public method obtains a local [[SparkSession]], builds a few DataFrames and prints
 * them to stdout; nothing is returned to the caller.
 */
class OtherOperations {

  /**
   * Returns the local Spark session used by all exercises in this class.
   *
   * `getOrCreate()` hands back an already running session when there is one, so calling
   * several exercises in a row reuses it instead of starting a new one.
   */
  private def localSession(): SparkSession =
    SparkSession.builder()
      .appName("fundament-sparka")
      .master("local")
      .getOrCreate()

  /**
   * A few statistics about pizza in the USA.
   *
   * Reads `pizza_data.csv` (with a header row), converts the `Price` column to a double and
   * prints, per company: the average price, the summed price and the number of distinct
   * pizza names (most pizzas first).
   */
  def pizzaOperations(): Unit = {
    val spark: SparkSession = localSession()

    val pizzaRawDF: Dataset[Row] = spark.read
      .option("header", "true")
      .csv("pizza_data.csv")

    pizzaRawDF.show()

    // Strip any '$' and ',' characters from the price text before casting it to a number.
    val numericPrice = regexp_replace(col("Price"), "[$,]", "").cast(DataTypes.DoubleType)
    val pizzaCleanDF: Dataset[Row] = pizzaRawDF.withColumn("Price", numericPrice)

    val pricesByCompany = pizzaCleanDF.groupBy("Company")

    // max and min can be used here in the same way as avg
    val companiesWithAvgPricesDF: Dataset[Row] = pricesByCompany.avg("Price")

    val companiesWithSumPricesDF: Dataset[Row] = pricesByCompany.sum("Price")

    // distinct() first, so that a pizza listed several times for a company is counted once
    val companiesWithPizzaCountDF: Dataset[Row] = pizzaCleanDF
      .select("Company", "Pizza Name")
      .distinct()
      .groupBy("Company")
      .count()
      .orderBy(col("count").desc)

    companiesWithAvgPricesDF.show()
    companiesWithSumPricesDF.show()
    companiesWithPizzaCountDF.show()
  }

  /**
   * Netflix titles exercise:
   *  - get rid of nulls,
   *  - count how many titles are assigned to each genre,
   *  - hash the directors.
   *
   * Reads `netflix_titles.csv` (with a header row) and writes the hashed directors to
   * `hashed_netflix_directors.parquet`.
   */
  def netflixOperations(): Unit = {
    val spark: SparkSession = localSession()

    val netflixDF: Dataset[Row] = spark.read
      .option("header", "true")
      .csv("netflix_titles.csv")

    netflixDF.show(false)
    netflixDF.printSchema()

    // Replace nulls in string columns with a visible placeholder.
    val netflixWithoutNullsDF: Dataset[Row] = netflixDF.na.fill("N.A.")

    netflixWithoutNullsDF.show(false)

    // `listed_in` holds a comma-separated list of genres. The separator pattern also swallows
    // the whitespace around each comma; splitting on a bare "," would leave a leading space on
    // every genre but the first, so the same genre would be counted in two separate groups.
    val genres = split(col("listed_in"), "\\s*,\\s*")

    val listedInStatsDF: Dataset[Row] = netflixWithoutNullsDF
      .select(col("show_id"), explode(genres).as("singleListedIn"))
      .groupBy("singleListedIn")
      .count()

    listedInStatsDF.show(false)
    listedInStatsDF.printSchema()

    val hashedDirectorsDF: Dataset[Row] = netflixWithoutNullsDF
      .select("show_id", "director")
      .withColumn("hashed_director", sha2(col("director"), 512))

    hashedDirectorsDF.show(false)

    // NOTE: no save mode is set, so Spark's default applies and this write fails when the
    // output path already exists (e.g. on a second run).
    hashedDirectorsDF.write
      .parquet("hashed_netflix_directors.parquet")
  }

  /**
   * Parses JSON strings held in a DataFrame column into typed columns.
   *
   * Prints, in order: the raw strings and their schema, the DataFrame with the parsed struct
   * column and its schema, and finally the struct flattened into top-level columns.
   */
  def simpleJson(): Unit = {
    val spark: SparkSession = localSession()

    import spark.implicits._

    val dataSchema = StructType(
      List(
        StructField("name", DataTypes.StringType, nullable = false),
        StructField("time", DataTypes.TimestampType, nullable = false)
      )
    )

    val jsonDF = Seq(
      """{"name": "marek","time": 1469501675}""",
      """{"name": "kasia","time": 1469501623}"""
    ).toDF("value")

    // Build the parsed column once and reuse it for every view below.
    val parsedDF = jsonDF.withColumn("jsonData", from_json(col("value"), dataSchema))
    val fromJsonDF = parsedDF.select("jsonData.*")

    jsonDF.show(false)
    jsonDF.printSchema()
    parsedDF.show(false)
    parsedDF.printSchema()
    fromJsonDF.show(false)
  }

  /**
   * Basic column operations on an in-memory data set:
   *  - merge two columns into one (first name and last name),
   *  - drop the columns that are no longer needed,
   *  - filter out minors (anyone younger than 18).
   */
  def structureOperations(): Unit = {
    val spark: SparkSession = localSession()

    import spark.implicits._

    // (id, firstName, lastName, age)
    val people: Seq[(String, String, String, Int)] = Seq(
      ("1", "marek", "czuma", 28), ("2", "ania", "kowalska", 30), ("3", "magda", "nowak", 28),
      ("4", "jan", "kowalski", 15), ("5", "jozef", "czuma", 25), ("6", "ignacy", "czuma", 35),
      ("7", "laura", "moscicka", 68), ("8", "zuzanna", "birecka", 12), ("9", "roman", "kowalski", 45),
      ("10", "marek", "kowalski", 68), ("11", "ignacy", "nowak", 43), ("12", "ania", "nowak", 33),
      ("13", "laura", "czuma", 6), ("14", "karol", "birecki", 21), ("15", "karol", "nowak", 43),
      ("16", "jan", "moscicki", 33), ("17", "jan", "birecki", 36), ("18", "andrzej", "kowalski", 82)
    )

    val peopleDF: Dataset[Row] = people.toDF("id", "firstName", "lastName", "age")

    val fullName = concat(col("firstName"), lit(" "), col("lastName"))

    val peopleWithFullNameDF: Dataset[Row] = peopleDF
      .withColumn("fullName", fullName)
      .drop("firstName", "lastName")

    val adultDF: Dataset[Row] = peopleWithFullNameDF.filter(col("age").geq(18))

    adultDF.show()
  }

}
